This dataset is used to classify whether or not a patient has a certain unspecified disease according to part of his/her information. I used a decision tree, logistic regression, and multiple artificial neural network and deep learning models to solve this problem. After validating model performance and tuning each model, I finally got the prediction for each model.
Attributes’ information about the dataset (Disease Prediction Training.csv):
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, KFold,train_test_split,cross_val_score, cross_validate, ShuffleSplit, LeaveOneOut
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Fix the numpy RNG seed for reproducibility of any numpy-based randomness.
np.random.seed(33)
# Load the training data and run the usual first-look inspections.
# Each bare expression below is a notebook-cell display, not a no-op.
df = pd.read_csv("./Disease Prediction Training.csv")
df.head()
df.shape
df.dtypes
# Per-column missing-value counts (the narrative below reports none found).
df.isnull().sum()
df.describe()
df.info()
df.head()
# Exploratory plots: age distribution plus violin plots relating blood
# pressure, height and age to glucose/cholesterol, split by gender.
# These are what reveal the implausible blood-pressure values cleaned below.
ax = sns.distplot(df["Age"])  # NOTE: distplot is deprecated in newer seaborn
fig = px.violin(df, y="High Blood Pressure", x="Glucose", color="Gender", box=True, points="all",
hover_data=df.columns)
fig.show()
fig = px.violin(df, y="Low Blood Pressure", x="Glucose", color="Gender", box=True, points="all",
hover_data=df.columns)
fig.show()
fig = px.violin(df, y="Height", x="Cholesterol", color="Gender", box=True, points="all",
hover_data=df.columns)
fig.show()
fig = px.violin(df, y="Age", x="Glucose", color="Gender", box=True, points="all",
hover_data=df.columns)
fig.show()
def blood_pressure(x):
    """Normalize a raw blood-pressure reading into a plausible range.

    Negative readings are sign-flipped; implausibly small ones (0 < x < 30)
    are scaled up by 10; implausibly large ones are scaled down by 10
    (300 < x <= 2000) or by 100 (x > 2000). Anything else (including 0 and
    the 30-300 band) passes through unchanged.
    """
    if x < 0:
        return abs(x)
    if 0 < x < 30:
        return x * 10
    if 300 < x <= 2000:
        return int(x / 10)
    if x > 2000:
        return int(x / 100)
    return x
# Clean both pressure columns, drop rows with a non-positive diastolic
# ("low") value, then cap the diastolic pressure at 90.
df["Low Blood Pressure"] = df["Low Blood Pressure"].apply(blood_pressure)
df["High Blood Pressure"] = df["High Blood Pressure"].apply(blood_pressure)
df = df[df['Low Blood Pressure'] > 0]
# BUG FIX: the original row-wise lambda (df.apply(..., axis=1)) returned the
# ENTIRE ROW Series whenever the value was <= 90, instead of the scalar,
# corrupting the column. Cap element-wise on the column instead.
df["Low Blood Pressure"] = df["Low Blood Pressure"].apply(lambda v: 90 if v > 90 else v)
def CheckHigh(x):
    """Floor a systolic ("high") blood-pressure value at 120."""
    return 120 if x < 120 else x
# Apply the 120 floor to every systolic reading.
df["High Blood Pressure"]= df["High Blood Pressure"].apply(CheckHigh)
def switchA(a, b):
    """Return the larger of a and b.

    Used below to guarantee the systolic ("high") pressure is never
    below the diastolic ("low") pressure in the same row.
    """
    if a >= b:
        return a
    return b
# Enforce systolic >= diastolic row by row, then re-inspect both distributions.
df["High Blood Pressure"] = df.apply(lambda x: switchA(x["High Blood Pressure"],x["Low Blood Pressure"]),axis=1)
ax = sns.distplot(df["High Blood Pressure"])
ax = sns.distplot(df["Low Blood Pressure"])
# Categorical columns to one-hot encode (name kept as-is; "catorical" is a
# typo for "categorical" but the identifier is referenced below).
catorical_list = ['Gender','Cholesterol','Glucose']
def cat2num(df, columns=None):
    """One-hot encode the given categorical columns of a DataFrame.

    Each listed column is replaced by dummy columns named '<col>__<level>'
    (the double underscore comes from prefix=col+'_' combined with pandas'
    default '_' prefix separator).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; a new frame is returned, columns are dropped
        from the returned copy only.
    columns : list of str, optional
        Columns to encode. Defaults to the module-level ``catorical_list``,
        preserving the original call signature cat2num(df).

    Returns
    -------
    pandas.DataFrame with the listed columns replaced by dummy indicators.
    """
    if columns is None:
        columns = catorical_list
    for col in columns:
        df = df.join(pd.get_dummies(df[col], prefix=col + '_'))
        df = df.drop([col], axis=1)
    return df
# One-hot encode the categorical columns, then split features/target.
df =cat2num(df)
X = df.drop(['Disease'],axis = 1)
y = df['Disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
from sklearn.preprocessing import StandardScaler  # re-import; already imported at file top
scaler = StandardScaler()
# Fit the scaler on the training split only and apply it to both splits.
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
# NOTE(review): refitting on the full X below overwrites the train-fitted
# scaler object; any later scaler.transform call uses full-data statistics
# (mild train/test leakage) -- confirm this is intended.
scaler.fit(X)
X_std = scaler.transform(X)
I found this dataset doesn't have any missing values, but the violin plots show its blood-pressure columns have a lot of problems: some values are beyond any reasonable range. So I used pandas' apply function to make them more sensible, setting the highest value of Low Blood Pressure to 90 and the lowest value of High Blood Pressure to 120, to make sure blood pressure stays within a reasonable and scientific range. Then I converted the categorical data into numerical form by one-hot encoding, and split this dataset into training data and testing data.
def model_performance(y_test, y_pred):
    """Print the per-class precision/recall/F1 report for a prediction."""
    report = classification_report(y_test, y_pred, target_names=["Disease","No Disease"])
    print(report)
def roc_chart(model, label):
    """Plot the ROC curve of a fitted classifier on the global test split.

    Parameters
    ----------
    model : fitted estimator exposing predict_proba.
    label : str
        Legend/title label for the model.

    Uses the module-level X_test_std / y_test; shows the plot and prints
    the AUC as a side effect.
    """
    y_predict_proba = model.predict_proba(X_test_std)
    # Column 1 holds the probability of the positive class.
    fpr, tpr, threshold = roc_curve(y_test, pd.DataFrame(y_predict_proba)[1])
    # BUG FIX: the baseline label was misspelled "Ramdom Guess".
    plt.plot([0,1],[0,1],'r--',label='Random Guess')
    plt.plot(fpr, tpr, linestyle='solid', linewidth=2, label=label)
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.title(label+' ROC')
    plt.show()
    print('AUC is:',auc(fpr, tpr))
def ann_plot(model):
    """Plot training loss and binary accuracy per epoch, side by side.

    Parameters
    ----------
    model : Keras History object returned by Model.fit (its .history dict
        must contain 'loss' and 'binary_accuracy').
    """
    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
    ax1.plot(model.history['loss'])
    ax1.set_title('model loss',loc='center')
    ax1.set_ylabel('loss ')
    ax1.set_xlabel('epoch')
    # BUG FIX: only the training curve is plotted, so the legend must not
    # claim a 'test' series as the original ['train', 'test'] legend did.
    ax1.legend(['train'], loc='upper left')
    ax2.plot(model.history['binary_accuracy'])
    ax2.set_title('binary_accuracy',loc='center')
    ax2.set_ylabel('binary_accuracy')
    ax2.set_xlabel('epoch')
    ax2.legend(['train'], loc='upper left')
    plt.show()
# Baseline logistic regression. NOTE(review): the pipeline includes a
# StandardScaler but is fed the already-standardized X_train_std, so the
# features are scaled twice (redundant, though harmless for correctness).
logr_pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=33))
logr_pipe.fit(X_train_std, y_train)
y_pred = logr_pipe.predict(X_test_std)
model_performance(y_test, y_pred)
# Grid-search penalty/solver/C with shuffled 10-fold CV, timing the search.
lr_time = time.time()
cv = KFold(n_splits=10, shuffle=True, random_state=33)
from sklearn.model_selection import GridSearchCV  # re-import; already imported at file top
lr_param_grid = {'penalty': ['l1','l2'],
'solver': ['liblinear','saga'],
'C': np.linspace(0.01,1.01,9)}
grid = GridSearchCV(LogisticRegression(random_state=33)
, lr_param_grid,cv=cv)
lr_grid = grid.fit(X_train_std,y_train)
y_pred = lr_grid.predict(X_test_std)
best_lr = lr_grid.best_estimator_
lr_cost = time.time()-lr_time
print(lr_grid.best_score_)
print(lr_grid.best_params_)
print(f'the best model of Logistic regression needs {time.strftime("%H:%M:%S", time.gmtime(lr_cost))} to run.')
roc_chart(lr_grid.best_estimator_,'Logistic Regression Grid')
model_performance(y_test, y_pred)
# Rank features by the fitted coefficients (largest positive first).
pd.DataFrame({'Features':X.columns,'Coefficients':best_lr.coef_.ravel()}).sort_values("Coefficients", ascending=False)
By tuning logistic regression hyper-parameters such as solver, penalty and C, I got the best logistic regression model for this dataset. The ROC and AUC show a good result; in addition, High Blood Pressure, Age and Cholesterol__too high are the top 3 factors affecting whether a person gets the disease or not.
I set the loss function to binary_crossentropy and the metric to binary_accuracy because this problem is a binary classification, so using binary_crossentropy and binary_accuracy can help get a good performance. I didn't do any regularization because none of the three models overfit, according to the evaluation on the training data.
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
#from tensorflow.keras.regularizers import l2
# ANN0: a single sigmoid output unit directly on the input features --
# effectively a logistic-regression-shaped network with no hidden layers.
ANN0_model = Sequential()
ANN0_model.add(Dense(1, activation='sigmoid',input_shape=(len(X_train.columns),)))
ANN0_model.output_shape
# Displayed to diagnose the numpy/TensorFlow version warnings noted below.
np.__version__
The reason why I got a lot of warnings is that the installed version of numpy is the latest; they would disappear if the numpy version were below 1.16. The reference is here: https://github.com/tensorflow/tensorflow/issues/30427
ANN0_model.summary()
# First compile/fit with SGD...
ANN0_model.compile(loss='binary_crossentropy'
, optimizer='sgd'
, metrics=['binary_accuracy'
])
ANN0_model.fit(X_train_std, y_train, epochs=10, batch_size=128, verbose=1)
# ...then recompile with RMSprop and continue training (weights carry over
# from the SGD run above), timing only this second phase.
start = time.time()
ANN0_model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['binary_accuracy'])
ANN0 = ANN0_model.fit(X_train_std, y_train, epochs=15, batch_size=128, verbose=1)
ann0_cost = time.time() - start
# Hyper-parameters recorded for the comparison table.
# NOTE(review): 'epochs':20 does not match the 10+15 epochs actually run above.
ann0_params= {'epochs':20,'batch_size':128,
'loss':'binary_crossentropy',
'optimizer':'RMSprop','metrics':'binary_accuracy'}
print(f'the best model of ANN0 needs {time.strftime("%H:%M:%S", time.gmtime(ann0_cost))} to run.')
ann_plot(ANN0)
# Evaluate on train (overfitting check) and on test.
checkover = ANN0_model.evaluate(X_train_std, y_train, batch_size=128,verbose=1)
ann0_score = ANN0_model.evaluate(X_test_std, y_test, batch_size=128)
# NOTE(review): the ROC/AUC below is computed on the TRAINING split, and
# Sequential.predict_proba was removed in newer TensorFlow (use predict).
y_pred_keras = ANN0_model.predict_proba(X_train_std).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_train, y_pred_keras)
ANN0_score = auc(fpr_keras, tpr_keras)
print(ANN0_score)
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(ANN0_score))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve')
plt.legend(loc='best')
I changed the number of input nodes to be both larger and smaller, and did not see any improvement. I also tried changing the optimization function, as well as the epoch count and batch size, but found no improvement there either. By using the evaluate function, the binary accuracy of this model is 0.7295. I used AUC to check this result; it is 0.7965.
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# ANN1: one 16-unit ReLU hidden layer plus a sigmoid output unit.
ANN1_model = Sequential()
ANN1_model.add(Dense(16, activation='relu', input_shape=(len(X_train.columns), )))
ANN1_model.add(Dense(1, activation='sigmoid'))
ANN1_model.output_shape
ANN1_model.summary()
# Three training phases on the SAME model (weights accumulate across fits):
# 20 epochs SGD, 25 epochs RMSprop, then a timed 30-epoch RMSprop run.
ANN1_model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['binary_accuracy'])
ANN1_model.fit(X_train_std, y_train, epochs=20, batch_size=128, verbose=0)
ANN1_model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['binary_accuracy'])
ANN1 = ANN1_model.fit(X_train_std, y_train, epochs=25, batch_size=128, verbose=1)
ann_plot(ANN1)
start = time.time()
ANN1_model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['binary_accuracy'])
ANN1 = ANN1_model.fit(X_train_std, y_train, epochs=30, batch_size=128, verbose=1)
ann1_cost = time.time() - start
print(f'the best model of ANN1 needs {time.strftime("%H:%M:%S", time.gmtime(ann1_cost))} to run.')
ann1_params= {'epochs':30,'batch_size':128,
'loss':'binary_crossentropy',
'optimizer':'RMSprop','metrics':'binary_accuracy'}
# Evaluate on train (overfitting check) and on test, then ROC/AUC.
# NOTE(review): ROC/AUC is computed on the TRAINING split.
checkover = ANN1_model.evaluate(X_train_std, y_train, batch_size=128)
ann1_score = ANN1_model.evaluate(X_test_std, y_test, batch_size=128)
y_pred_keras = ANN1_model.predict_proba(X_train_std).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_train, y_pred_keras)
ANN1_score = auc(fpr_keras, tpr_keras)
print(ANN1_score)
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(ANN1_score))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve')
plt.legend(loc='best')
When I added one hidden layer, I tried changing the optimizer among RMSprop, Adadelta and Adam, and found a small improvement in the ANN1 model. When I changed the number of epochs, I found the model would overfit if the value was more than 25, so I set 25 epochs for this model. In addition, tuning showed that 32 input nodes and a single 32-node hidden layer helped me get a better performance. In the end, the accuracy is 0.7375, and its AUC can reach 0.8060.
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# ANN2: two ReLU hidden layers (16 then 8 units) and a sigmoid output unit.
ANN2_model = Sequential()
ANN2_model.add(Dense(16, activation='relu', input_shape=(len(X_train.columns), )))
ANN2_model.add(Dense(8, activation='relu'))
ANN2_model.add(Dense(1, activation='sigmoid'))
ANN2_model.output_shape
ANN2_model.summary()
# Single timed training run with RMSprop.
start = time.time()
ANN2_model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['binary_accuracy'])
ANN2 = ANN2_model.fit(X_train_std, y_train, epochs=50, batch_size=128, verbose=1)
ann2_cost = time.time() - start
print(f'the best model of ANN2 needs {time.strftime("%H:%M:%S", time.gmtime(ann2_cost))} to run.')
# NOTE(review): the recorded params say 30 epochs / adam, but the fit above
# used 50 epochs with RMSprop -- confirm which configuration was intended.
ann2_params= {'epochs':30,'batch_size':128,
'loss':'binary_crossentropy',
'optimizer':'adam','metrics':'binary_accuracy'}
ann_plot(ANN2)
# Evaluate on train (overfitting check) and on test, then ROC/AUC.
# NOTE(review): ROC/AUC is computed on the TRAINING split.
checkover = ANN2_model.evaluate(X_train_std, y_train, batch_size=128)
ann2_score = ANN2_model.evaluate(X_test_std, y_test, batch_size=128)
y_pred_keras = ANN2_model.predict_proba(X_train_std).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_train, y_pred_keras)
ANN2_score = auc(fpr_keras, tpr_keras)
print(ANN2_score)
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(ANN2_score))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve')
plt.legend(loc='best')
In this model architecture, when I added two hidden layers, I found that with the first hidden layer at 16 units, the second at 8, and 30 epochs, this model gets a better performance while not overfitting. Its accuracy is 0.7380, and its AUC can reach 0.8058.
# Baseline decision tree in a scaling pipeline. NOTE(review): the inputs are
# already standardized, so the extra StandardScaler is redundant (and scaling
# does not change tree splits anyway).
dt_pipe = make_pipeline(StandardScaler(), DecisionTreeClassifier())
dt_pipe.fit(X_train_std, y_train)
y_pred = dt_pipe.predict(X_test_std)
model_performance(y_test, y_pred)
# Grid-search the tree's key hyper-parameters with 10-fold CV, timed.
dt_start = time.time()
param = {'criterion':['gini','entropy']
,'max_depth':range(2,8,1)
,'min_samples_leaf':range(1,10,2)
,'min_impurity_decrease':np.arange(0.01,0.5,0.05)
}
kfold = KFold(n_splits=10)
grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param, cv=kfold)
dt_grid = grid.fit(X_train_std,y_train)
print('best_param:',dt_grid.best_params_,'best_score:', dt_grid.best_score_)
y_pred = dt_grid.predict(X_test_std)
best_dt = dt_grid.best_estimator_
dt_cost = time.time()-dt_start
print(f'the best model of Decision Tree needs {time.strftime("%H:%M:%S", time.gmtime(dt_cost))} to run.')
# BUG FIX: this ROC chart is for the decision tree; the original copy-pasted
# label said 'Logistic Regression'.
roc_chart(dt_grid,'Decision Tree')
model_performance(y_test, y_pred)
# Rank features by the tuned tree's importances (largest first).
dt_cof = pd.DataFrame({'Features':X.columns,'DT_Importance':best_dt.feature_importances_}).sort_values("DT_Importance", ascending=False)
dt_cof
# Random forest: grid-search tree count, depth and features-per-split
# (default CV folds; no explicit cv argument).
rf_param_grid = {'n_estimators': range(80, 140,20),
'max_depth':range(1, 10),
'max_features': range(3, 5)}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_param_grid)
rf_grid = rf_grid.fit(X_train_std, y_train)
roc_chart(rf_grid,'Random Forest')
rf_fi = pd.DataFrame({'Features':X.columns,'RF_Importance':rf_grid.best_estimator_.feature_importances_}).sort_values("RF_Importance", ascending=False)
# Gradient boosting: grid-search learning rate plus the same tree knobs.
param_grid = {'learning_rate': np.arange(0.02, 0.1, 0.01),
'n_estimators': range(80, 100,20),
'max_features':range(3, 8),
'max_depth': range(2, 5)}
gbt_grid = GridSearchCV(GradientBoostingClassifier(), param_grid)
gbt_grid.fit(X_train_std, y_train)
roc_chart(gbt_grid,'Gradient Boosting')
gbt_fi = pd.DataFrame({'Features':X.columns,'GBT_Importance':gbt_grid.best_estimator_.feature_importances_}).sort_values("GBT_Importance", ascending=False)
# Side-by-side importances: concat aligns the sorted frames on their original
# row index, so each row still refers to a single feature.
feature_compared = pd.concat([dt_cof,rf_fi['RF_Importance'],gbt_fi['GBT_Importance']],axis=1)
feature_compared.sort_values('GBT_Importance',ascending=False).head()
# Compare the cross-validated best scores of the three tree-based models.
compared_df = pd.DataFrame([dt_grid.best_score_],index=['Score'],columns=['DT'])
compared_df['RF'] = rf_grid.best_score_
compared_df['GBT'] = gbt_grid.best_score_
compared_df
There is an obvious result: the top 3 feature importances are the same for all of them — High Blood Pressure, Low Blood Pressure and Age. Random Forest has a better performance than Decision Tree because Random Forest builds many trees. As for Gradient Boosting, it uses gradient descent to optimize the model, so the Gradient Boosting Tree can get a better performance than Random Forest.
They have a similar performance. Logistic regression and the single-layer perceptron are also very similar in architecture — both use the sigmoid function to classify. The reason they perform differently is that logistic regression models a function of the mean of a Bernoulli distribution as a linear equation, whereas the perceptron makes no probabilistic assumptions about either the model or its parameters. As for linear SVM and LR, both are linear classifiers, but SVM is based on distance while LR depends on probability, so SVM is not affected by the data distribution, but LR is.
def table(name, param, score, cost):
    """Build a one-row summary Series for the model-comparison table.

    Parameters
    ----------
    name : str, model display name.
    param : dict or str, the tuned hyper-parameters.
    score : float, the model's performance figure.
    cost : str, big-O time estimate (or '-' when not applicable).

    Returns
    -------
    pandas.Series indexed by the four fixed column labels.
    """
    labels = ['Name'
              , 'Key hyperparameters tuned'
              , 'Model performance'
              , 'Estimate of time(n=nums,d=dimension,k=neighbor nums)']
    return pd.Series([name, param, score, cost], index=labels)
# One summary row per model; the ANN rows use the test-set binary accuracy
# (index [1] of evaluate's [loss, accuracy] output).
lr_table = table('Logistics Regression',lr_grid.best_params_,lr_grid.best_score_,'O(nd)')
ann0_table = table('ANN0',ann0_params,ann0_score[1],'-')
ann1_table = table('ANN1',ann1_params,ann1_score[1],'-')
ann2_table = table('ANN2',ann2_params,ann2_score[1],'-')
dr_table = table('Decision Tree',dt_grid.best_params_,dt_grid.best_score_,'O(n*log(n)*d)')
hw4_table = pd.concat([lr_table,dr_table,ann0_table,ann1_table,ann2_table],axis=1).T
hw4_table
# Merge with the previous homework's table and rank all models by score.
hw3_table = pd.read_csv('./hw3_table.csv')
index = ['Name','Key hyperparameters tuned','Model performance','Estimate of time']
# NOTE(review): this rebinding shadows the table() function defined above,
# and .reindex() without arguments is a no-op.
table = pd.concat([hw3_table,hw4_table],axis=0,ignore_index=True,sort=False)
table =table.drop(["Unnamed: 0"],axis=1).reindex()
table.sort_values("Model performance", ascending =False)
# Load the held-out test file twice: `testing` gets transformed for the
# models, `testing_data` keeps the original columns (the ID column is
# recovered from it below).
testing = pd.read_csv('./Disease Prediction Testing.csv')
testing_data = pd.read_csv('./Disease Prediction Testing.csv')
testing = cat2num(testing)
testing = testing.drop(['ID'],axis=1)
# NOTE(review): a FRESH scaler is fitted on the test data itself, so the
# test features are NOT scaled with the training statistics -- confirm this
# is intended before trusting the predictions.
scaler = StandardScaler()
scaler.fit(testing)
testing_std = scaler.transform(testing)
# Per-model predictions. NOTE: Sequential.predict_classes was removed in
# newer TensorFlow; (model.predict(x) > 0.5) is the modern equivalent.
dt_pred = best_dt.predict(testing_std)
lr_pred = lr_grid.predict(testing_std)
ANN0_pred = ANN0_model.predict_classes(testing_std)
ANN1_pred = ANN1_model.predict_classes(testing_std)
ANN2_pred = ANN2_model.predict_classes(testing_std)
# Collect every model's prediction into one frame keyed by patient ID,
# then persist it to CSV.
result = pd.DataFrame({'ID':testing_data.ID
,'Decision Tree':dt_pred
,'Logistic Regression':lr_pred
,'ANN0':ANN0_pred.ravel()
,'ANN1':ANN1_pred.ravel()
,'ANN2':ANN2_pred.ravel()
})
result
result.describe()
result.to_csv('homework_4_YueyuanHe_results.csv')